In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
import warnings
warnings.filterwarnings("ignore")
In [2]:
df = pd.read_csv("WA_Fn-UseC_-HR-Employee-Attrition.csv" , sep = "," , encoding = 'utf-8')
df
Out[2]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 4 80 1 6 3 3 2 2 2 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 No Travel_Frequently 884 Research & Development 23 2 Medical 1 2061 ... 3 80 1 17 3 3 5 2 0 3
1466 39 No Travel_Rarely 613 Research & Development 6 1 Medical 1 2062 ... 1 80 1 9 5 3 7 7 1 7
1467 27 No Travel_Rarely 155 Research & Development 4 3 Life Sciences 1 2064 ... 2 80 1 6 0 3 6 2 0 3
1468 49 No Travel_Frequently 1023 Sales 2 3 Medical 1 2065 ... 4 80 0 17 3 2 9 6 0 8
1469 34 No Travel_Rarely 628 Research & Development 8 3 Medical 1 2068 ... 1 80 0 6 3 4 4 3 1 2

1470 rows × 35 columns

In [3]:
df.shape
Out[3]:
(1470, 35)
In [4]:
df[df.duplicated()==True] #no duplicates
Out[4]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager

0 rows × 35 columns

In [5]:
df.columns
Out[5]:
Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')
In [6]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1470 non-null   int64 
 1   Attrition                 1470 non-null   object
 2   BusinessTravel            1470 non-null   object
 3   DailyRate                 1470 non-null   int64 
 4   Department                1470 non-null   object
 5   DistanceFromHome          1470 non-null   int64 
 6   Education                 1470 non-null   int64 
 7   EducationField            1470 non-null   object
 8   EmployeeCount             1470 non-null   int64 
 9   EmployeeNumber            1470 non-null   int64 
 10  EnvironmentSatisfaction   1470 non-null   int64 
 11  Gender                    1470 non-null   object
 12  HourlyRate                1470 non-null   int64 
 13  JobInvolvement            1470 non-null   int64 
 14  JobLevel                  1470 non-null   int64 
 15  JobRole                   1470 non-null   object
 16  JobSatisfaction           1470 non-null   int64 
 17  MaritalStatus             1470 non-null   object
 18  MonthlyIncome             1470 non-null   int64 
 19  MonthlyRate               1470 non-null   int64 
 20  NumCompaniesWorked        1470 non-null   int64 
 21  Over18                    1470 non-null   object
 22  OverTime                  1470 non-null   object
 23  PercentSalaryHike         1470 non-null   int64 
 24  PerformanceRating         1470 non-null   int64 
 25  RelationshipSatisfaction  1470 non-null   int64 
 26  StandardHours             1470 non-null   int64 
 27  StockOptionLevel          1470 non-null   int64 
 28  TotalWorkingYears         1470 non-null   int64 
 29  TrainingTimesLastYear     1470 non-null   int64 
 30  WorkLifeBalance           1470 non-null   int64 
 31  YearsAtCompany            1470 non-null   int64 
 32  YearsInCurrentRole        1470 non-null   int64 
 33  YearsSinceLastPromotion   1470 non-null   int64 
 34  YearsWithCurrManager      1470 non-null   int64 
dtypes: int64(26), object(9)
memory usage: 402.1+ KB
In [7]:
df.describe().transpose()
Out[7]:
count mean std min 25% 50% 75% max
Age 1470.0 36.923810 9.135373 18.0 30.00 36.0 43.00 60.0
DailyRate 1470.0 802.485714 403.509100 102.0 465.00 802.0 1157.00 1499.0
DistanceFromHome 1470.0 9.192517 8.106864 1.0 2.00 7.0 14.00 29.0
Education 1470.0 2.912925 1.024165 1.0 2.00 3.0 4.00 5.0
EmployeeCount 1470.0 1.000000 0.000000 1.0 1.00 1.0 1.00 1.0
EmployeeNumber 1470.0 1024.865306 602.024335 1.0 491.25 1020.5 1555.75 2068.0
EnvironmentSatisfaction 1470.0 2.721769 1.093082 1.0 2.00 3.0 4.00 4.0
HourlyRate 1470.0 65.891156 20.329428 30.0 48.00 66.0 83.75 100.0
JobInvolvement 1470.0 2.729932 0.711561 1.0 2.00 3.0 3.00 4.0
JobLevel 1470.0 2.063946 1.106940 1.0 1.00 2.0 3.00 5.0
JobSatisfaction 1470.0 2.728571 1.102846 1.0 2.00 3.0 4.00 4.0
MonthlyIncome 1470.0 6502.931293 4707.956783 1009.0 2911.00 4919.0 8379.00 19999.0
MonthlyRate 1470.0 14313.103401 7117.786044 2094.0 8047.00 14235.5 20461.50 26999.0
NumCompaniesWorked 1470.0 2.693197 2.498009 0.0 1.00 2.0 4.00 9.0
PercentSalaryHike 1470.0 15.209524 3.659938 11.0 12.00 14.0 18.00 25.0
PerformanceRating 1470.0 3.153741 0.360824 3.0 3.00 3.0 3.00 4.0
RelationshipSatisfaction 1470.0 2.712245 1.081209 1.0 2.00 3.0 4.00 4.0
StandardHours 1470.0 80.000000 0.000000 80.0 80.00 80.0 80.00 80.0
StockOptionLevel 1470.0 0.793878 0.852077 0.0 0.00 1.0 1.00 3.0
TotalWorkingYears 1470.0 11.279592 7.780782 0.0 6.00 10.0 15.00 40.0
TrainingTimesLastYear 1470.0 2.799320 1.289271 0.0 2.00 3.0 3.00 6.0
WorkLifeBalance 1470.0 2.761224 0.706476 1.0 2.00 3.0 3.00 4.0
YearsAtCompany 1470.0 7.008163 6.126525 0.0 3.00 5.0 9.00 40.0
YearsInCurrentRole 1470.0 4.229252 3.623137 0.0 2.00 3.0 7.00 18.0
YearsSinceLastPromotion 1470.0 2.187755 3.222430 0.0 0.00 1.0 3.00 15.0
YearsWithCurrManager 1470.0 4.123129 3.568136 0.0 2.00 3.0 7.00 17.0
In [8]:
df.describe(include="O")
Out[8]:
Attrition BusinessTravel Department EducationField Gender JobRole MaritalStatus Over18 OverTime
count 1470 1470 1470 1470 1470 1470 1470 1470 1470
unique 2 3 3 6 2 9 3 1 2
top No Travel_Rarely Research & Development Life Sciences Male Sales Executive Married Y No
freq 1233 1043 961 606 882 326 673 1470 1054
In [9]:
#data is data frame with label encoded values 
data=df.copy()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
cols = ['Attrition', 'BusinessTravel', 'Department',
       'EducationField', 'Gender', 'JobRole', 'MaritalStatus',
       'Over18', 'OverTime']
data[cols] = data[cols].apply(LabelEncoder().fit_transform)
data.head()
Out[9]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 2 1102 2 1 2 1 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 0 1 279 1 8 1 1 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 1 2 1373 1 2 2 4 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 0 1 1392 1 3 4 1 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 0 2 591 1 2 1 3 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [10]:
import matplotlib.pyplot as plt

plt.figure(figsize=(7, 7))

# Specify the colors you want to use
colors = ['#FF6B6B', '#6BFFA6']
(data['Attrition'].value_counts()).plot.pie(autopct="%1.1f%%", colors=colors)
plt.title('Attrition percentage')

plt.show()
In [11]:
plt.figure(figsize=(5,5,))
(df['Gender'].value_counts()).plot.pie(autopct = "%1.1f%%", colors=['#1f6eed', '#ed1fc0'])
plt.title('Gender percentage')
Out[11]:
Text(0.5, 1.0, 'Gender percentage')
In [12]:
plt.figure(figsize=(10,5))
sns.countplot(x='BusinessTravel',data=data,palette='Blues')
plt.grid(True)
plt.title('Distrubtion Of BusinessTravel',fontsize=20)
plt.xlabel('BusinessTravel',fontsize=20)
plt.ylabel('Count',fontsize=20)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()
In [13]:
plt.figure(figsize=(10,5))
sns.countplot(x='Department',data=data,palette='Reds')
plt.grid(True)
plt.title('Distrubtion Of Department',fontsize=20)
plt.xlabel('Department',fontsize=20)
plt.ylabel('Count',fontsize=20)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()
In [14]:
pd.crosstab(data.Attrition,data.Department)
Out[14]:
Department 0 1 2
Attrition
0 51 828 354
1 12 133 92
In [15]:
Attrition=df[df['Attrition']=="Yes"]
Attrition
Out[15]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 1 80 0 8 0 1 6 4 0 5
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 2 80 0 7 3 3 0 0 0 0
14 28 Yes Travel_Rarely 103 Research & Development 24 3 Life Sciences 1 19 ... 2 80 0 6 4 3 4 2 0 3
21 36 Yes Travel_Rarely 1218 Sales 9 4 Life Sciences 1 27 ... 2 80 0 10 4 3 5 3 0 3
24 34 Yes Travel_Rarely 699 Research & Development 6 1 Medical 1 31 ... 3 80 0 8 2 3 4 2 1 3
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1438 23 Yes Travel_Frequently 638 Sales 9 3 Marketing 1 2023 ... 1 80 1 1 3 2 1 0 1 0
1442 29 Yes Travel_Rarely 1092 Research & Development 1 4 Medical 1 2027 ... 2 80 3 4 3 4 2 2 2 2
1444 56 Yes Travel_Rarely 310 Research & Development 7 2 Technical Degree 1 2032 ... 4 80 1 14 4 1 10 9 9 8
1452 50 Yes Travel_Frequently 878 Sales 1 4 Life Sciences 1 2044 ... 4 80 2 12 3 3 6 3 0 1
1461 50 Yes Travel_Rarely 410 Sales 28 3 Marketing 1 2055 ... 2 80 1 20 3 3 3 2 2 0

237 rows × 35 columns

In [16]:
Attrition.Department.value_counts()
Out[16]:
Research & Development    133
Sales                      92
Human Resources            12
Name: Department, dtype: int64
In [17]:
import plotly.express as px
Department_count = Attrition['Department'].value_counts()
plt.figure(figsize=(10,10))
fig = px.pie(Attrition, values=Department_count, names=Department_count.index)
fig.update_traces(hoverinfo='label+percent', textfont_size=20,
                  marker=dict(colors=['	#FFB3B3', '	#C1EFFF','#FFDBA4'], line=dict(color='#fafafa', width=2)))

fig.update_traces(hole=0.5, hoverinfo="label+percent+name")
fig.update_layout(annotations=[dict(text='Department', x=0.5, y=0.5, font_size=20, showarrow=False)])
fig.show()
<Figure size 1000x1000 with 0 Axes>
In [18]:
fig = px.ecdf(df, x="MonthlyRate", color="Attrition")
fig.show()
In [19]:
fig = px.histogram(df, x='OverTime'
            ,color='Attrition', barmode="group")

fig.update_layout(template='plotly_white')
fig.show()
In [20]:
fig = px.box(df, x="EnvironmentSatisfaction", y='Attrition')
fig.show()
In [21]:
jobsat= df[(df['JobSatisfaction'] == 2) | (df['JobSatisfaction'] == 3)]
(Attrition['JobSatisfaction'].value_counts()/Attrition.shape[0]*100).plot.bar( color = ["#68228B" , '#79CDCD'  , '#00E5EE', '#FCBAAD'])
plt.title('Attrition job satisfaction')
Out[21]:
Text(0.5, 1.0, 'Attrition job satisfaction')
In [22]:
fig = px.box(df, x="JobSatisfaction", y='Attrition')
fig.show()
In [23]:
df['distance']=df["DistanceFromHome"]
df['distance']= np.where(df['distance'] <= 13, 'Near', 'Far')
df
Out[23]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager distance
0 41 Yes Travel_Rarely 1102 Sales 1 2 Life Sciences 1 1 ... 80 0 8 0 1 6 4 0 5 Near
1 49 No Travel_Frequently 279 Research & Development 8 1 Life Sciences 1 2 ... 80 1 10 3 3 10 7 1 7 Near
2 37 Yes Travel_Rarely 1373 Research & Development 2 2 Other 1 4 ... 80 0 7 3 3 0 0 0 0 Near
3 33 No Travel_Frequently 1392 Research & Development 3 4 Life Sciences 1 5 ... 80 0 8 3 3 8 7 3 0 Near
4 27 No Travel_Rarely 591 Research & Development 2 1 Medical 1 7 ... 80 1 6 3 3 2 2 2 2 Near
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
1465 36 No Travel_Frequently 884 Research & Development 23 2 Medical 1 2061 ... 80 1 17 3 3 5 2 0 3 Far
1466 39 No Travel_Rarely 613 Research & Development 6 1 Medical 1 2062 ... 80 1 9 5 3 7 7 1 7 Near
1467 27 No Travel_Rarely 155 Research & Development 4 3 Life Sciences 1 2064 ... 80 1 6 0 3 6 2 0 3 Near
1468 49 No Travel_Frequently 1023 Sales 2 3 Medical 1 2065 ... 80 0 17 3 2 9 6 0 8 Near
1469 34 No Travel_Rarely 628 Research & Development 8 3 Medical 1 2068 ... 80 0 6 3 4 4 3 1 2 Near

1470 rows × 36 columns

In [24]:
sns.kdeplot(
   data=df, x="RelationshipSatisfaction", hue="distance",
   fill=True, common_norm=False, palette=['red','black'],
   alpha=.5, linewidth=0,
)
Out[24]:
<Axes: xlabel='RelationshipSatisfaction', ylabel='Density'>
In [25]:
df.Age.describe()
Out[25]:
count    1470.000000
mean       36.923810
std         9.135373
min        18.000000
25%        30.000000
50%        36.000000
75%        43.000000
max        60.000000
Name: Age, dtype: float64
In [26]:
sns.stripplot(data=df, y="Age", hue="Gender",
              x="Attrition",alpha=0.3, orient="v")
Out[26]:
<Axes: xlabel='Attrition', ylabel='Age'>
In [27]:
g=df[df['Attrition']=='Yes']
g2=df[df['Attrition']=='No']
fig, axs = plt.subplots(2, 2, figsize=(7, 7))
sns.kdeplot(x = g['MonthlyIncome'],
            fill = True, color = "blue", alpha = 0.5, ax=axs[1, 0]). set_xlabel('MonthlyIncome attrition')
sns.kdeplot(x = g2['MonthlyIncome'],
            fill = True, color = "black", alpha = 0.5, ax=axs[1, 1]). set_xlabel('MonthlyIncome stay')
sns.kdeplot(x = df['MonthlyIncome'],
            fill = False, alpha = 0.5, ax=axs[0, 0]). set_xlabel('MonthlyIncome total')
sns.kdeplot(x = df['MonthlyIncome'],hue=df['Attrition'],
            fill = False, alpha = 0.5, ax=axs[0, 1]). set_xlabel('MonthlyIncome hue')
Out[27]:
Text(0.5, 0, 'MonthlyIncome hue')
In [28]:
count_data = df.groupby(['StockOptionLevel', 'Attrition']).size().reset_index(name='Count')
px.line(count_data, x='StockOptionLevel', y='Count', color='Attrition', markers=True)
In [29]:
data2=data.copy()
label=LabelEncoder()
for x in data2.select_dtypes(include='object').columns:
    data2[x]=label.fit_transform(data2[x])
data2.head()
Out[29]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 1 2 1102 2 1 2 1 1 1 ... 1 80 0 8 0 1 6 4 0 5
1 49 0 1 279 1 8 1 1 1 2 ... 4 80 1 10 3 3 10 7 1 7
2 37 1 2 1373 1 2 2 4 1 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 0 1 1392 1 3 4 1 1 5 ... 3 80 0 8 3 3 8 7 3 0
4 27 0 2 591 1 2 1 3 1 7 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 35 columns

In [30]:
plt.figure(figsize=(20,10))
sns.heatmap(data2.corr(),annot=True,fmt='.2',cbar=False,cmap='Blues_r')
data2.corr()
Out[30]:
Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
Age 1.000000 -0.159205 0.024751 0.010661 -0.031882 -0.001686 0.208034 -0.040873 NaN -0.010145 ... 0.053535 NaN 0.037510 0.680381 -0.019621 -0.021490 0.311309 0.212901 0.216513 0.202089
Attrition -0.159205 1.000000 0.000074 -0.056652 0.063991 0.077924 -0.031373 0.026846 NaN -0.010577 ... -0.045872 NaN -0.137145 -0.171063 -0.059478 -0.063939 -0.134392 -0.160545 -0.033019 -0.156199
BusinessTravel 0.024751 0.000074 1.000000 -0.004086 -0.009044 -0.024469 0.000757 0.023724 NaN -0.015578 ... -0.035986 NaN -0.016727 0.034226 0.015240 -0.011256 -0.014575 -0.011497 -0.032591 -0.022636
DailyRate 0.010661 -0.056652 -0.004086 1.000000 0.007109 -0.004985 -0.016806 0.037709 NaN -0.050990 ... 0.007846 NaN 0.042143 0.014515 0.002453 -0.037848 -0.034055 0.009932 -0.033229 -0.026363
Department -0.031882 0.063991 -0.009044 0.007109 1.000000 0.017225 0.007996 0.013720 NaN -0.010895 ... -0.022414 NaN -0.012193 -0.015762 0.036875 0.026383 0.022920 0.056315 0.040061 0.034282
DistanceFromHome -0.001686 0.077924 -0.024469 -0.004985 0.017225 1.000000 0.021042 0.002013 NaN 0.032916 ... 0.006557 NaN 0.044872 0.004628 -0.036942 -0.026556 0.009508 0.018845 0.010029 0.014406
Education 0.208034 -0.031373 0.000757 -0.016806 0.007996 0.021042 1.000000 -0.039592 NaN 0.042070 ... -0.009118 NaN 0.018422 0.148280 -0.025100 0.009819 0.069114 0.060236 0.054254 0.069065
EducationField -0.040873 0.026846 0.023724 0.037709 0.013720 0.002013 -0.039592 1.000000 NaN -0.002516 ... -0.004378 NaN -0.016185 -0.027848 0.049195 0.041191 -0.018692 -0.010506 0.002326 -0.004130
EmployeeCount NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
EmployeeNumber -0.010145 -0.010577 -0.015578 -0.050990 -0.010895 0.032916 0.042070 -0.002516 NaN 1.000000 ... -0.069861 NaN 0.062227 -0.014365 0.023603 0.010309 -0.011240 -0.008416 -0.009019 -0.009197
EnvironmentSatisfaction 0.010146 -0.103369 0.004174 0.018355 -0.019395 -0.016075 -0.027128 0.043163 NaN 0.017621 ... 0.007665 NaN 0.003432 -0.002693 -0.019359 0.027627 0.001458 0.018007 0.016194 -0.004999
Gender -0.036311 0.029453 -0.032981 -0.011716 -0.041583 -0.001851 -0.016547 -0.002504 NaN 0.022556 ... 0.022868 NaN 0.012716 -0.046881 -0.038787 -0.002753 -0.029747 -0.041483 -0.026985 -0.030599
HourlyRate 0.024287 -0.006846 0.026528 0.023381 -0.004144 0.031131 0.016775 -0.021941 NaN 0.035179 ... 0.001330 NaN 0.050263 -0.002334 -0.008548 -0.004607 -0.019582 -0.024106 -0.026716 -0.020123
JobInvolvement 0.029820 -0.130016 0.039062 0.046135 -0.024586 0.008783 0.042438 -0.002655 NaN -0.006888 ... 0.034297 NaN 0.021523 -0.005533 -0.015338 -0.014617 -0.021355 0.008717 -0.024184 0.025976
JobLevel 0.509604 -0.169105 0.019311 0.002966 0.101963 0.005303 0.101589 -0.044933 NaN -0.018519 ... 0.021642 NaN 0.013984 0.782208 -0.018191 0.037818 0.534739 0.389447 0.353885 0.375281
JobRole -0.122427 0.067151 0.002724 -0.009472 0.662431 -0.001015 0.004236 0.015599 NaN -0.010336 ... -0.020218 NaN -0.019171 -0.145439 0.001342 0.027764 -0.083657 -0.028354 -0.046384 -0.041150
JobSatisfaction -0.004892 -0.103481 -0.033962 0.030571 0.021001 -0.003669 -0.011296 -0.034401 NaN -0.046247 ... -0.012454 NaN 0.010690 -0.020185 -0.005779 -0.019459 -0.003803 -0.002305 -0.018214 -0.027656
MaritalStatus -0.095029 0.162070 0.024001 -0.069586 0.056073 -0.014437 0.004053 0.014420 NaN -0.008155 ... 0.022549 NaN -0.662577 -0.077886 0.010629 0.014708 -0.059986 -0.065822 -0.030915 -0.038570
MonthlyIncome 0.497855 -0.159840 0.034319 0.007707 0.053130 -0.017014 0.094961 -0.041070 NaN -0.014829 ... 0.025873 NaN 0.005408 0.772893 -0.021736 0.030683 0.514285 0.363818 0.344978 0.344079
MonthlyRate 0.028051 0.015170 -0.014107 -0.032182 0.023642 0.027473 -0.026084 -0.027182 NaN 0.012648 ... -0.004085 NaN -0.034323 0.026442 0.001467 0.007963 -0.023655 -0.012815 0.001567 -0.036746
NumCompaniesWorked 0.299635 0.043494 0.020875 0.038153 -0.035882 -0.029251 0.126317 -0.008663 NaN -0.001251 ... 0.052733 NaN 0.030075 0.237639 -0.066054 -0.008366 -0.118421 -0.090754 -0.036814 -0.110319
Over18 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
OverTime 0.028062 0.246118 0.016543 0.009135 0.007481 0.025514 -0.020322 0.002259 NaN -0.024037 ... 0.048493 NaN -0.000449 0.012754 -0.079113 -0.027092 -0.011687 -0.029758 -0.012239 -0.041586
PercentSalaryHike 0.003634 -0.013478 -0.029377 0.022704 -0.007840 0.040235 -0.011111 -0.011214 NaN -0.012944 ... -0.040490 NaN 0.007528 -0.020608 -0.005221 -0.003280 -0.035991 -0.001520 -0.022154 -0.011985
PerformanceRating 0.001904 0.002889 -0.026341 0.000473 -0.024604 0.027110 -0.024539 -0.005614 NaN -0.020359 ... -0.031351 NaN 0.003506 0.006744 -0.015579 0.002572 0.003435 0.034986 0.017896 0.022827
RelationshipSatisfaction 0.053535 -0.045872 -0.035986 0.007846 -0.022414 0.006557 -0.009118 -0.004378 NaN -0.069861 ... 1.000000 NaN -0.045952 0.024054 0.002497 0.019604 0.019367 -0.015123 0.033493 -0.000867
StandardHours NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
StockOptionLevel 0.037510 -0.137145 -0.016727 0.042143 -0.012193 0.044872 0.018422 -0.016185 NaN 0.062227 ... -0.045952 NaN 1.000000 0.010136 0.011274 0.004129 0.015058 0.050818 0.014352 0.024698
TotalWorkingYears 0.680381 -0.171063 0.034226 0.014515 -0.015762 0.004628 0.148280 -0.027848 NaN -0.014365 ... 0.024054 NaN 0.010136 1.000000 -0.035662 0.001008 0.628133 0.460365 0.404858 0.459188
TrainingTimesLastYear -0.019621 -0.059478 0.015240 0.002453 0.036875 -0.036942 -0.025100 0.049195 NaN 0.023603 ... 0.002497 NaN 0.011274 -0.035662 1.000000 0.028072 0.003569 -0.005738 -0.002067 -0.004096
WorkLifeBalance -0.021490 -0.063939 -0.011256 -0.037848 0.026383 -0.026556 0.009819 0.041191 NaN 0.010309 ... 0.019604 NaN 0.004129 0.001008 0.028072 1.000000 0.012089 0.049856 0.008941 0.002759
YearsAtCompany 0.311309 -0.134392 -0.014575 -0.034055 0.022920 0.009508 0.069114 -0.018692 NaN -0.011240 ... 0.019367 NaN 0.015058 0.628133 0.003569 0.012089 1.000000 0.758754 0.618409 0.769212
YearsInCurrentRole 0.212901 -0.160545 -0.011497 0.009932 0.056315 0.018845 0.060236 -0.010506 NaN -0.008416 ... -0.015123 NaN 0.050818 0.460365 -0.005738 0.049856 0.758754 1.000000 0.548056 0.714365
YearsSinceLastPromotion 0.216513 -0.033019 -0.032591 -0.033229 0.040061 0.010029 0.054254 0.002326 NaN -0.009019 ... 0.033493 NaN 0.014352 0.404858 -0.002067 0.008941 0.618409 0.548056 1.000000 0.510224
YearsWithCurrManager 0.202089 -0.156199 -0.022636 -0.026363 0.034282 0.014406 0.069065 -0.004130 NaN -0.009197 ... -0.000867 NaN 0.024698 0.459188 -0.004096 0.002759 0.769212 0.714365 0.510224 1.000000

35 rows × 35 columns

In [31]:
X=data2.drop('Attrition',axis=1)
y=data2['Attrition']
key=X.keys()
X.head()
Out[31]:
Age BusinessTravel DailyRate Department DistanceFromHome Education EducationField EmployeeCount EmployeeNumber EnvironmentSatisfaction ... RelationshipSatisfaction StandardHours StockOptionLevel TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
0 41 2 1102 2 1 2 1 1 1 2 ... 1 80 0 8 0 1 6 4 0 5
1 49 1 279 1 8 1 1 1 2 3 ... 4 80 1 10 3 3 10 7 1 7
2 37 2 1373 1 2 2 4 1 4 4 ... 2 80 0 7 3 3 0 0 0 0
3 33 1 1392 1 3 4 1 1 5 4 ... 3 80 0 8 3 3 8 7 3 0
4 27 2 591 1 2 1 3 1 7 1 ... 4 80 1 6 3 3 2 2 2 2

5 rows × 34 columns

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle =True)
print('X_train shape is ' , X_train.shape)
print('X_test shape is ' , X_test.shape)
print('y_train shape is ' , y_train.shape)
print('y_test shape is ' , y_test.shape)
X_train shape is  (1176, 34)
X_test shape is  (294, 34)
y_train shape is  (1176,)
y_test shape is  (294,)
In [33]:
random=Pipeline([
                    ('min_max',MinMaxScaler()),
                    ('model',RandomForestClassifier(criterion = 'gini',n_estimators=100,max_depth=10,random_state=44))
                    ])
random.fit(X_train,y_train)
Out[33]:
Pipeline(steps=[('min_max', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=10, random_state=44))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('min_max', MinMaxScaler()),
                ('model',
                 RandomForestClassifier(max_depth=10, random_state=44))])
MinMaxScaler()
RandomForestClassifier(max_depth=10, random_state=44)
In [34]:
print('RandomForestClassifierModel Train Score is : ' ,random.score(X_train, y_train))
print('RandomForestClassifierModel Test Score is : ' ,random.score(X_test, y_test))
RandomForestClassifierModel Train Score is :  0.9795918367346939
RandomForestClassifierModel Test Score is :  0.8707482993197279
In [35]:
svc=Pipeline([
                  ('min_max',MinMaxScaler()),
                  ('model',SVC(kernel= 'rbf',max_iter=100,C=2.0,gamma='auto'))
                    ])
svc.fit(X_train,y_train)
Out[35]:
Pipeline(steps=[('min_max', MinMaxScaler()),
                ('model', SVC(C=2.0, gamma='auto', max_iter=100))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('min_max', MinMaxScaler()),
                ('model', SVC(C=2.0, gamma='auto', max_iter=100))])
MinMaxScaler()
SVC(C=2.0, gamma='auto', max_iter=100)
In [36]:
print('SVCModel Train Score is : ' ,svc.score(X_train, y_train))
print('SVCModel Test Score is : '  , svc.score(X_test, y_test))
SVCModel Train Score is :  0.5008503401360545
SVCModel Test Score is :  0.5136054421768708

Thank You¶

In [ ]: